* ---- Session setup ---------------------------------------------------------
* Empty the data in memory, turn off output paging, write logs as plain text,
* set the maximum matrix size to 800, then discard any programs already
* defined and close any log that is still open (both ignored on failure).
clear
*set mem 600m
set more off
set logtype text
set matsize 800
capture program drop _all

capture log close

/* Calculate average value of farm property, farm equipment, and livestock in the US for 1850, 1860 and 1870. Use data from
Haines and ICPSR for this.
*/

* Per-farm averages of farm value, equipment value and livestock value for
* 1850, 1860 and 1870. Each figure is read from the rows with state==100 and
* county==0 and stored in a local named <stat>_per_farm_<yy>.
* NOTE(review): state==100 & county==0 is presumably the national-total row of
* the ICPSR files -- confirm against the codebook.
* Paths use forward slashes throughout so the file also runs outside Windows.
* The assert stops the run if the benchmark row is absent or has missing
* inputs; otherwise the local would silently hold a missing value and the
* wealth adjustment later in this file would turn farmers' wealth into 0.

* --- 1850 ---
use "$histdir/ICPSR_1850_stct.dta", clear
gen val_per_farm = farmval/farms
gen equip_per_farm = equipval/farms
gen lvstk_per_farm = livstock/farms

foreach stat in val equip lvstk {
	summ `stat'_per_farm if state==100 & county==0
	local `stat'_per_farm_50 = r(mean)
	assert r(N) > 0
}

* --- 1860: the number of farms is the sum of the farm-size categories ---
use "$histdir/icpsr_1860_stct.dta", clear
gen farms = farm39 + farm1019 + farm2049 + farm5099 + farm100 + farm500 + farm1000
gen val_per_farm = farmval/farms
gen equip_per_farm = equipval/farms
gen lvstk_per_farm = livstock/farms

foreach stat in val equip lvstk {
	summ `stat'_per_farm if state==100 & county==0
	local `stat'_per_farm_60 = r(mean)
	assert r(N) > 0
}

* --- 1870 ---
use "$histdir/icpsr_1870_stct.dta", clear
gen val_per_farm = farmval/farms
gen equip_per_farm = equipval/farms
gen lvstk_per_farm = livstock/farms

foreach stat in val equip lvstk {
	summ `stat'_per_farm if state==100 & county==0
	local `stat'_per_farm_70 = r(mean)
	assert r(N) > 0
}


* Percentile bounds of the 1850 and 1860 occupational-score distributions.
* Sample for each base year: sex==1, age 15 or more, non-zero occscore and
* race==1. occscore is cut into 100 quantile bins with xtile. For bin i,
*   frac<yy>_i = share of the sample in the bin, in rounded percent
*   hi<yy>_i   = running sum of the shares up to and including bin i
*   lo<yy>_i   = previous hi + 1, or 0 when the bin holds no one
* These locals are read again inside the year loop further down.
foreach base in 1850 1860 {
	local b = substr("`base'", 3, 2)

	use "$rawdir/`base'_1%.dta", clear

	keep if sex==1 & age>=15 & occscore!=0 & race==1
	xtile occscore_pctile = occscore, nq(100)

	count
	local N`b' = r(N)
	di `N`b''

	* Bin 1 opens the scale: its lower bound is fixed at 1.
	count if occscore_pctile==1
	local frac`b'_1 = round((r(N)/`N`b'')*100)
	di `frac`b'_1'
	local lo`b'_1 = 1
	local hi`b'_1 = `frac`b'_1'
	di `hi`b'_1'

	forvalues i=2/100 {
		local prev = `i'-1
		count if occscore_pctile==`i'
		local frac`b'_`i' = round((r(N)/`N`b'')*100)

		* The upper bound always accumulates; an empty bin adds nothing to it
		* and is marked with a lower bound of 0.
		local hi`b'_`i' = `hi`b'_`prev'' + `frac`b'_`i''
		local lo`b'_`i' = 0
		if `frac`b'_`i'' > 0 {
			local lo`b'_`i' = `hi`b'_`prev'' + 1
		}
	}

	forvalues i=1/100 {
		di "lo `i' = `lo`b'_`i''"
		di "hi `i' = `hi`b'_`i''"
	}
}

* Two-digit suffix for every census year handled below: local s<year> holds
* the last two characters of the year (s1850 is 50, s1900 is 00, and so on).
foreach yr in 1850 1860 1870 1880 1900 1910 1920 {
	local s`yr' = substr("`yr'", 3, 2)
}
	
	
	
foreach y in 1850 1860 1870 1880 1900 1910 1920 {

	* Load the 1% sample for this census year.
	use "$rawdir/`y'_1%.dta", clear
	cap drop _merge

	* Attach occwage1900 by occupation and state. Rows found only in the
	* using file are discarded. Forward slashes keep the path portable.
	merge m:1 occ1950 statefip using "$tempdir/occ_wage_bystate.dta", keepus(occwage1900) keep(master match) nogenerate

	* For statefip==11 rows, overwrite occwage1900 with the mean over all
	* rows sharing the occupation, except occ1950==100 and codes of 980 or more.
	* NOTE(review): statefip 11 is presumably DC -- confirm the reason for the
	* special treatment.
	bysort occ1950: egen hold = mean(occwage1900)
	replace occwage1900 = hold if statefip == 11 & occ1950 != 100 & occ1950 != . & occ1950 < 980
	drop hold

	* Attach the LIDO score file on its full key; using-only rows are dropped.
	sort age statefip sex race region occ1950 ind1950
	merge m:1 age statefip sex race region occ1950 ind1950 using "${altdir}/lido_score_1950_public_use.dta", keep(master match) nogenerate

	sort datanum serial pernum

	* Quantile rank of occscore within this year, computed over sex==1,
	* age 15 or more, non-zero occscore, race==1; missing for everyone else.
	* NOTE(review): the lo/hi bounds applied below were built on a 1-100 scale
	* (xtile with nq(100) on the 1850 and 1860 samples), whereas occpctile
	* here only takes the values 1-20 and is later divided by 100. Confirm
	* that nq(20) is really intended.
	gen x = occscore if sex==1 & age>=15 & occscore!=0 & race==1
	xtile occpctile = x, nq(20)
	drop x

	* Same ranking for occwage1900.
	gen x = occwage1900 if sex==1 & age>=15 & occwage1900!=. & race==1
	xtile occpctile1900 = x, nq(20)
	drop x

	* Re-express the own-year rank as a bin of the 1850 and of the 1860
	* distribution: a row gets bin i when its occpctile lies between the lo and
	* hi bound of bin i. Bins with a lower bound of 0 are empty and skipped.
	gen occpctile_50dist = .
	gen occpctile_60dist = .

	forvalues i=1/100 {

		di "testing `lo50_`i''"

		foreach d in 50 60 {
			if `lo`d'_`i''>0 {
				replace occpctile_`d'dist = `i' if occpctile>=`lo`d'_`i'' & occpctile<=`hi`d'_`i''
			}
		}

	}

	* Put every occpctile variable on a 0-1 scale.
	foreach var of varlist occpctile* {
		replace `var' = `var'/100
	}
	
	*** Region of residence and region of birth indicators

	* Residence dummies are read off the tens digit of the region code:
	* 1 northeast, 2 midwest, 3 south, 4 west.
	assert region>=10
	local tens_south 3
	local tens_midwest 2
	local tens_northeast 1
	local tens_west 4
	foreach area in south midwest northeast west {
		gen `area' = floor(region/10)==`tens_`area''
	}
	foreach area in south midwest northeast west {
		tab `area'
	}

	* Birthplace codes grouped into the same two-digit region scheme; codes not
	* listed leave region_birth missing.
	gen region_birth = 11 if inlist(bpl, 9, 23, 25, 33, 44, 50)
	replace region_birth = 12 if inlist(bpl, 36, 34, 42)
	replace region_birth = 21 if inlist(bpl, 17, 18, 26, 39, 55)
	replace region_birth = 22 if inlist(bpl, 19, 20, 27, 29, 31, 38, 46)
	replace region_birth = 31 if inlist(bpl, 10, 11, 24, 12, 13, 37, 45, 51, 54)
	replace region_birth = 32 if inlist(bpl, 1, 21, 28, 47)
	replace region_birth = 33 if inlist(bpl, 5, 22, 40, 48)
	replace region_birth = 41 if inlist(bpl, 4, 8, 16, 32, 30, 35, 56, 49)
	replace region_birth = 42 if inlist(bpl, 6, 41, 53)

	* Birth-region dummies; a missing region_birth yields 0 in all four.
	foreach area in south midwest northeast west {
		gen born_`area' = floor(region_birth/10)==`tens_`area''
	}
	gen immig = bpl>100
	
	*** Sample restrictions

	* Drop non-whites.
	keep if race==1

	* Keep households that contain at least one child record (relate==3)
	* aged 15 or younger.
	egen fam_keep = max(age<=15 & relate==3), by(serial)
	keep if fam_keep==1
	drop fam_keep

	* Keep households whose head (relate==1) is male (sex==1).
	* Original note: 1486 female HH.
	egen male_HH = max(relate==1 & sex==1), by(serial)
	keep if male_HH==1
	drop male_HH

	* Keep dad, mom and children only (relate 1, 2 and 3).
	keep if inlist(relate, 1, 2, 3)
	
	*** Sibling information, stored on the child rows (relate==3) only

	* Number of male and of female children in the household. The counts are
	* taken before the older children are dropped below, and each child is
	* included in the count for its own sex.
	egen num_bro = total(relate==3 & sex==1) if relate==3, by(serial)
	egen num_sis = total(relate==3 & sex==2) if relate==3, by(serial)
	gen num_sib = num_bro + num_sis

	* Age rank among all children of the household, and among children of the
	* same sex. With the field option the highest age receives rank 1.
	egen age_rank_child_`s`y'' = rank(age) if relate==3, by(serial relate) field
	egen age_rank_ownsex_child_`s`y'' = rank(age) if relate==3 & inlist(sex, 1, 2), by(serial relate sex) field

	*** delete kids who are 16 or older
	drop if relate==3 & age>=16

	*** Literacy and age of each person and of the parents
	* Parent values are spread to every row of the household by averaging an
	* expression that is non-missing only on the parent's own row: the head
	* (relate==1) for the father, a female relate==2 record for the mother.
	local sfx `s`y''

	gen byte literacy = (lit==4)
	lab var literacy "literate=1"

	egen lit_`sfx'_father = mean(cond(relate==1, literacy, .)), by(serial)
	egen lit_`sfx'_mother = mean(cond(relate==2 & sex==2, literacy, .)), by(serial)

	*** age
	egen age_`sfx'_father = mean(cond(relate==1, age, .)), by(serial)

	* Own age, recorded for children only.
	gen int age_`sfx' = age if relate==3

	egen age_`sfx'_mother = mean(cond(sex==2 & relate==2, age, .)), by(serial)

	* Parent age minus the age on the current row.
	gen ageatbirth_`sfx'_father = age_`sfx'_father - age
	gen ageatbirth_`sfx'_mother = age_`sfx'_mother - age
	
	*** Father's occupation-based measures
	* Every measure below is read on the head's row (relate==1) and spread to
	* all rows of the household as <stem>_<yy>_father. The two lists are
	* parallel: word k of sources feeds word k of stems.
	local sfx `s`y''
	local sources occscore lido occpctile occpctile_50dist occpctile_60dist occwage1900 sei presgl erscor50 edscor50 npboss50 occ1950
	local stems occscore occlido occpctile_owndist occpctile_50dist occpctile_60dist occwage1900 sei presgl erscor edscor npboss occ
	local n_src : word count `sources'

	forvalues k = 1/`n_src' {
		local src : word `k' of `sources'
		local stem : word `k' of `stems'
		gen x = `src' if relate==1
		egen `stem'_`sfx'_father = mean(x), by(serial)
		drop x
	}

	* An occscore of 0 is recoded to missing.
	replace occscore_`sfx'_father = . if occscore_`sfx'_father==0

	*** replace 9999 with .
	foreach stem in occscore sei presgl erscor edscor npboss {
		replace `stem'_`sfx'_father = . if `stem'_`sfx'_father==9999
	}

	* Father's occscore, left missing when the head's occ1950 is 100.
	gen x = occscore if relate==1 & occ1950!=100
	egen occscorenf_`sfx'_father = mean(x), by(serial)
	drop x
	
	*** Wealth


/* Adjust farmers' wealth. The reason for this is that much of a farmer's wealth was used to conduct business, not for
consumption. So this is not exactly a great measure of labour income for farmers. As it turns out, if we don't do this adjustment,
farmers end up way in the upper tail of the occupational distribution, and we really don't think this is right (for the reasons
just mentioned). So we try to subtract "business related" property from farmers' property, by subtracting the value of farm
property (from farm owners' wealth, or farmers who report positive real estate wealth) and subtracting the value of
equipment and livestock from all farmers' wealth (under the assumption that tenant farmers used their own equipment).

Two details of the implementation:
 - The farmer test uses the father's occupation (occ_YY_father, built earlier in this loop from the head's occ1950), not the
   occupation on the current row, so every row of a household carries the same adjusted value.
 - Missing wealth stays missing. In Stata a missing value satisfies "> 0" and max(., 0) returns 0, so without the explicit
   missing() test a missing wealth would be turned into 0.
*/


	local sfx `s`y''

	* Real property is available for 1850, 1860 and 1870.
	if inlist("`sfx'", "50", "60", "70") {
		gen x = realprop if relate==1
		egen realprop_`sfx'_father = mean(x), by(serial)
		drop x
		gen logrealprop_`sfx'_father = log(realprop_`sfx'_father)

		* Subtract the average farm value from farmers with positive real property, floored at 0.
		gen realprop_adj_`sfx'_father = realprop_`sfx'_father
		replace realprop_adj_`sfx'_father = max(realprop_adj_`sfx'_father - `val_per_farm_`sfx'',0) if occ_`sfx'_father==100 & realprop_adj_`sfx'_father>0 & !missing(realprop_adj_`sfx'_father)
		gen logrealprop_adj_`sfx'_father = log(realprop_adj_`sfx'_father)
	}

	* Personal property is available for 1860 and 1870 only.
	if inlist("`sfx'", "60", "70") {
		gen x = realprop + persprop if relate==1
		egen totalprop_`sfx'_father = mean(x), by(serial)
		drop x
		gen logtotalprop_`sfx'_father = log(totalprop_`sfx'_father)

		gen x = persprop if relate==1
		egen persprop_`sfx'_father = mean(x), by(serial)
		drop x
		gen logpersprop_`sfx'_father = log(persprop_`sfx'_father)

		* Subtract average equipment and livestock values from farmers with positive personal property, floored at 0.
		gen persprop_adj_`sfx'_father = persprop_`sfx'_father
		replace persprop_adj_`sfx'_father = max(persprop_adj_`sfx'_father - `equip_per_farm_`sfx'' - `lvstk_per_farm_`sfx'',0) if occ_`sfx'_father==100 & persprop_adj_`sfx'_father>0 & !missing(persprop_adj_`sfx'_father)
		gen logpersprop_adj_`sfx'_father = log(persprop_adj_`sfx'_father)

	}
	
	**** wages from PH book
	* Attach the occupation-level wage variables by occ1950. The path is
	* quoted and uses a forward slash so that it survives spaces in the
	* directory name and runs on every platform. nok discards observations
	* that exist only in the using file.
	cap drop _merge
	sort occ1950
	merge occ1950 using "$altdir/occ_wage_PH.dta", keep(occwage_PH_all* q*) nok

	local sfx `s`y''

	* Father's wage in three variants, each read on the head's row:
	* all occupations, excluding occ1950==100, and the _imp version.
	gen x1=occwage_PH_all if relate==1
	gen x2=occwage_PH_all if relate==1 & occ1950~=100
	gen x3=occwage_PH_all_imp if relate==1

	egen occwage_PHall_`sfx'_father=mean(x1), by(serial)
	egen occwage_PHnofarm_`sfx'_father=mean(x2), by(serial)
	egen occwage_PHimpfarm_`sfx'_father=mean(x3), by(serial)
	drop x1 x2 x3

	* Two alternative wages for heads with occ1950==100 and a non-missing
	* imputed wage:
	*   farmown - wage of an owner-occupier from the boustan et al paper
	*   farmlab - wage of a farm laborer
	local wage_farmown 576
	local wage_farmlab 257.5647
	foreach kind in farmown farmlab {
		gen x=occwage_PH_all_imp if relate==1
		replace x=`wage_`kind'' if x~=. & occ1950==100
		egen occwage_`kind'_`sfx'_father=mean(x), by(serial)
		drop x
	}


	drop _merge
	
	* Occupation-level mean wealth measures (mean_weal_*), attached by occ1950;
	* observations found only in the using file are not kept.
	sort occ1950
	merge occ1950 using "$tempdir/occ_wealth.dta", keep(mean_weal*) nok

	* Spread the value on the head's row to the whole household.
	local sfx `s`y''
	local weal_kinds pers padj padj2 pxsth real radj rfull rfadj r50 r50adj r60 r60adj
	foreach kind of local weal_kinds {
		gen x = mean_weal_`kind' if relate==1
		egen occweal_`kind'_`sfx'_father = mean(x), by(serial)
		drop x

	}

	drop _merge


	***Log wages
	* One log<name> variable for every variable whose name starts with
	* occscore, occwage, occweal or occlido. The list is fixed before the loop
	* starts, so the new log variables are not themselves logged.
	unab to_log : occscore* occwage* occweal* occlido*
	foreach v of local to_log {
		gen log`v'=log(`v')
	}
	
	*** foreign born
	* Birthplace of the father (head row) and of the mother (female relate==2
	* row), spread to every row of the household. A birthplace code of 100 or
	* more is flagged as foreign.
	* The flags are left missing when the birthplace is missing. Stata treats
	* missing as larger than any number, so without the guard a household with
	* no mother record would be coded foreign_YY_mother = 1.
	local sfx `s`y''

	gen x = bpl if relate==1
	egen bpl_`sfx'_father = mean(x), by(serial)
	gen byte foreign_`sfx'_father = bpl_`sfx'_father >= 100 if !missing(bpl_`sfx'_father)
	drop x

	gen x=bpl if sex==2 & relate==2
	egen bpl_`sfx'_mother = mean(x), by(serial)
	gen byte foreign_`sfx'_mother = bpl_`sfx'_mother>=100 if !missing(bpl_`sfx'_mother)
	drop x

	* Own flag: 1 for foreign-born children, 0 for all other rows with a
	* known birthplace.
	gen foreign_`sfx'= (bpl>=100 & relate==3) if !missing(bpl)

	compress

	*** NAMES
	* First name: first word of namefrst, periods removed, trimmed, then put
	* in proper case.
	gen str first=proper(trim(subinstr(word(namefrst,1), ".", "", .)))

	* Middle name: second word of namefrst in proper case. init only records
	* that a middle name exists (the letter X), not the actual initial.
	gen str middle=proper(word(namefrst,2))

	gen init = "X" if middle!=""

	*** obvious abbreviations (not nicknames)
	* The list alternates abbreviation and full name.
	local abbrevs Wm William Geo George Chas Charles Danl Daniel Jas James Jos Joseph Robt Robert Richd Richard Saml Samuel Thos Thomas Fredk Frederick Jno John Tho Thomas Michl Michael
	local n_tok : word count `abbrevs'
	forvalues k = 1(2)`n_tok' {
		local nxt = `k' + 1
		local short : word `k' of `abbrevs'
		local full : word `nxt' of `abbrevs'
		replace first="`full'" if first=="`short'"
	}
	* Abbreviations containing an apostrophe are handled outside the list.
	replace first="Frederick" if first=="Fred'K"
	replace first="Samuel" if first=="Sam'L"

	* noname marks first names that contain one of the characters ? % - ! *
	* and first names that are empty.
	gen noname=0
	foreach ch in "?" "%" "-" "!" "*" {
		replace noname=1 if strpos(first, "`ch'")>0
	}
	replace noname=1 if missing(first)

	* Sort by first name for the merge that follows, and save the file.
	sort first
	save "$tempdir/`y'_fathers.dta", replace
	
	* Attach the male_drop / fem_drop flags by first name (the data are sorted
	* by first at this point) and mark as noname every person whose first
	* name is flagged for his or her own sex.
	merge first using "$tempdir/gender_miscode.dta", keep(male_drop fem_drop)
	drop if _merge==2
	replace noname=1 if (sex==1 & male_drop==1) | (sex==2 & fem_drop==1)
	tab noname if sex==1 & relate==3
	tab noname if sex==2 & relate==3
	drop _merge

	* noname flag of the father and of the mother, spread to the household.
	gen x=noname if sex==1 & relate==1
	egen noname_father=mean(x), by(serial)
	drop x

	gen x=noname if sex==2 & relate==2
	egen noname_mother = mean(x), by(serial)
	drop x

	gen firstmiddle = first+ init

	* Father's names on every row of the household. The father's row is
	* sorted to the top of its household before the fill-down, exactly as is
	* done for the mother below, so the result does not depend on the head
	* carrying the lowest pernum.
	gen first_father=first if sex==1 & relate==1
	gen firstmiddle_father = firstmiddle if sex==1 & relate==1
	tempvar is_father
	gen byte `is_father' = sex==1 & relate==1
	gsort serial -`is_father' pernum
	replace first_father=first_father[_n-1] if serial==serial[_n-1] & first_father==""
	replace firstmiddle_father = firstmiddle_father[_n-1] if serial==serial[_n-1] & firstmiddle_father==""
	drop `is_father'

	* Mother's names, filled down the same way; households without a mother
	* record keep empty strings.
	gen mother= sex==2 & relate==2
	gsort serial -mother pernum
	gen first_mother=first if mother==1
	gen firstmiddle_mother = firstmiddle if mother==1
	replace first_mother = first_mother[_n-1] if serial==serial[_n-1] & first_mother==""
	replace firstmiddle_mother = firstmiddle_mother[_n-1] if serial==serial[_n-1] & firstmiddle_mother==""

	* Soundex codes of the own, father's and mother's first name.
	gen firstsoundex = soundex(first)
	gen firstsoundex_father = soundex(first_father)
	gen firstsoundex_mother = soundex(first_mother)


	save "$tempdir/`y'_fathers.dta", replace
}





















